textdata <- read.csv( "/Users/carolineyu/Documents/1111/course_materials/Exercises/09_kickstarter/kickstarter_projects_2020_02.csv")
library(tm)
library(quanteda)
library(tidytext)
library(dplyr)
library(leaflet)
library(plotrix)
library(ggplot2)
library(RColorBrewer)
library(maptools)
library(plyr)
library(tidyverse)
library(tidytext)
text_backer <- textdata %>%
group_by(top_category, state) %>%
dplyr::summarise(n = n())
text_backer
library(ggplot2)
p1 <- ggplot(text_backer) + geom_bar(aes(x = as.factor(state), y = n,fill = as.factor(state)),stat = "identity") + facet_wrap(as.factor(text_backer$top_category)) + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
p1
#### From the plot p, we can tell that category Music has the most projects whose state is successful. P2 gives more straight-forward information.
text_backer1 <- textdata %>%
filter(state == "successful") %>%
group_by(top_category) %>%
dplyr::summarise(n = n())
p2 <- ggplot(text_backer1) + geom_bar(aes(x = top_category, y = n, fill = as.factor(top_category)),stat = "identity")
p2
#### I would also use achievement_ratio to define the success of projects.
achievement_ratio <- textdata %>%
filter(state == "successful") %>%
mutate(ach_ra = pledged / goal * 100) %>%
group_by(top_category) %>%
na.omit(ach_ra) %>%
select(top_category, ach_ra)
achievement_ratio
box1 <- ggplot(achievement_ratio) + geom_boxplot(aes(x = top_category, y = ach_ra, fill = as.factor(top_category)), outlier.colour = NA) + coord_trans(x = "identity", y = "identity", xlim = NULL, ylim = c(0,1e+03)) + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) + labs(x = "Category", y = "Achievement Ratio")+labs(fill = "category")
box1
#### Excluding the outliers, the project with largest achievement ratio is in the design category. And the projects in theater has the largest mean value of achievement ratio among categories.
success_prjects <- textdata %>%
filter(state == "successful") %>%
group_by(location_state) %>%
dplyr::summarise(n = n()) %>%
arrange(desc(n))
success_prjects
American_map <-readShapePoly("/Users/carolineyu/Downloads/USA_map/STATES.SHP")
AD1 <- American_map@data
AD2 <- data.frame(id=rownames(AD1),AD1)
American_map1 <- fortify(American_map)
American_map_data <- join(American_map1,AD2, type = "full")
American_map_data<-American_map_data[,1:12]
mydata<-data.frame(STATE_NAME=unique(American_map_data$STATE_NAME),STATE_ABBR=unique(American_map_data$STATE_ABBR))
data1<-subset(American_map_data,STATE_NAME!='Alaska'& STATE_NAME!='Hawaii')
data2<-subset(American_map_data,STATE_NAME=="Hawaii")
data3<-subset(American_map_data,STATE_NAME=="Alaska")
data2$long<-data2$long+65
data3$long<-data3$long+40
data3$lat<-data3$lat-42
data4<-rbind(data1,data2,data3)
colnames(success_prjects) <- c("STATE_ABBR","n")
American_data <- join(data4, success_prjects, by= ,type="full")
midpos <- function(AD1){mean(range(AD1,na.rm=TRUE))}
centres <- ddply(American_data,.(STATE_ABBR),colwise(midpos,.(long,lat)))
mynewdata<-join(centres,success_prjects,by = "STATE_ABBR", type="full")
content <- paste("State:", mynewdata$STATE_ABBR,"<br/c>",
"Number of Successful Projects:",mynewdata$n,"<br/c>")
pal = colorFactor("Set1", domain = mynewdata$STATE_ABBR)
color_pro = pal(mynewdata$STATE_ABBR)
library(leaflet)
m <- leaflet(mynewdata) %>%
addTiles()
m1 <- m %>%
addCircleMarkers(lng = ~long, lat = ~lat, color = color_pro,
popup = content)
m1
text1 <- textdata %>%
filter(state == "successful") %>%
mutate(ach_ra = pledged / goal * 100)%>%
arrange(desc(ach_ra))
text_success <- text1 %>%
head(1000)
scs_corpus <- VCorpus(VectorSource(text_success$blurb))
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
return(corpus)}
scs_corpus_clean <- clean_corpus(scs_corpus)
scs_stemmed <- tm_map(scs_corpus_clean, stemDocument)
scs_dtm <- DocumentTermMatrix(scs_stemmed)
scs_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 3665)>>
## Non-/sparse entries: 10430/3654570
## Sparsity : 100%
## Maximal term length: 36
## Weighting : term frequency (tf)
scs_td <- tidy(scs_dtm)
scs_tf_idf <- scs_td %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf_idf))
scs_tf_idf
library(wordcloud)
set.seed(12345)
purple_orange <- brewer.pal(10, "PuOr")
purple_orange <- purple_orange[-(2:8)]
wordcloud(scs_tf_idf$term, scs_tf_idf$tf, max.words = 100, colors = purple_orange )
text2 <- textdata %>%
filter(state == "failed") %>%
mutate(ach_ra = pledged / goal * 100)%>%
arrange(desc(ach_ra))
text_unsuccess <- text2 %>%
head(1000)
all_failed <- paste(text_unsuccess$blurb, collapse = "")
all_success <- paste(text_success$blurb, collapse = "")
all_proj <- c(all_failed, all_success)
all_proj <- VectorSource(all_proj)
all_corpus <- VCorpus(all_proj)
all_clean <- clean_corpus(all_corpus)
all_tdm <- TermDocumentMatrix(all_clean)
all_m <- as.matrix(all_tdm)
# Clean the corpus
all_clean <- clean_corpus(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_clean)
# Give the columns distinct names
colnames(all_tdm) <- c("failed", "success")
# Create all_m
all_m <- as.matrix(all_tdm)
common_words <- subset(
all_m,
all_m[, 1] > 0 & all_m[, 2] > 0
)
head(common_words)
## Docs
## Terms failed success
## able 3 3
## access 2 5
## accessible 2 1
## accompanying 2 1
## accomplish 1 1
## accuracy 2 1
difference <- abs(common_words[, 1] - common_words[, 2])
common_words <- cbind(common_words, difference)
common_words <- common_words[order(common_words[, 3],
decreasing = T), ]
head(common_words)
## failed success difference
## enamel 18 63 45
## help 61 19 42
## game 22 53 31
## pins 11 34 23
## pin 6 28 22
## book 47 28 19
top25_df <- data.frame(x = common_words[1:20, 1],
y = common_words[1:20, 2],
labels = rownames(common_words[1:20, ]))
# The plotrix package has been loaded
# Make pyramid plot
pyramid.plot(top25_df$x, top25_df$y,
labels = top25_df$labels,
main = "Words in Common",
gap = 18,
laxlab = NULL,
raxlab = NULL,
unit = NULL,
lxcol = "steelblue",
rxcol = "darkgreen",
top.labels = c("Failed",
"Words",
"Successful")
)
## [1] 5.1 4.1 4.1 2.1
library(quanteda)
library(dplyr)
FRE_textdata_success <- textstat_readability(as.character(text_success$blurb),measure = c('Flesch.Kincaid'))
FRE_textdata_unsuccess <- textstat_readability(as.character(text_unsuccess$blurb),measure = c('Flesch.Kincaid'))
FRE_textdata_success1 <- FRE_textdata_success %>%
mutate(state = "success")
FRE_textdata_unsuccess1 <- FRE_textdata_unsuccess %>%
mutate(state = "failed")
FRE_textdata1 <- rbind(FRE_textdata_success1, FRE_textdata_unsuccess1)
p_readibility <- ggplot() + geom_boxplot(aes(y = Flesch.Kincaid, x = as.factor(state), fill = as.factor(state)), data = FRE_textdata1) + labs(x = "Category", y = "Flesch.Kincaid")+labs(fill = "Success or Failed")
p_readibility
From the box plot, we can tell that the successful projects is slightly hard to understand compared to the projects that are failed.
Calculate the tone of each text based on the positive and negative words that are being used. You can rely on the Hu & Liu dictionary provided in lecture or use the Bing dictionary contained in the tidytext package (tidytext::sentiments). Visualize the relationship between tone of the document and success. Briefly comment.
pos <- read.table("/Users/carolineyu/Documents/1111/course_materials/Lectures/Week09/data/dictionaries/positive-words.txt", as.is=T)
neg <- read.table("/Users/carolineyu/Documents/1111/course_materials/Lectures/Week09/data/dictionaries/negative-words.txt", as.is=T)
sentiment <- function(words=c("really great good stuff bad")){
require(quanteda)
tok <- quanteda::tokens(words)
pos.count <- sum(tok[[1]]%in%pos[,1])
neg.count <- sum(tok[[1]]%in%neg[,1])
out <- (pos.count - neg.count)/(pos.count+neg.count)
return(out)}
sucess_tone <- as.data.frame(text_success$blurb)
sucess_tone <- mutate(sucess_tone, tone = NA)
sucess_tone <- mutate(sucess_tone,ach_ra = text_success$ach_ra)
colnames(sucess_tone) <- c("text","tone","ach_ra")
since the row dataset is too large, thus I decide to look at the 1000 documents in success and failed category
for (i in 1:nrow(sucess_tone)){
sucess_tone$tone[i] <- sentiment(as.character(sucess_tone$text[i]))
}
unsucess_tone <- as.data.frame(text_unsuccess$blurb)
unsucess_tone <- mutate(unsucess_tone, tone = NA)
unsucess_tone <- mutate(unsucess_tone,ach_ra = text_unsuccess$ach_ra)
colnames(unsucess_tone) <- c("text","tone","ach_ra")
since the row dataset is too large, thus I decide to look at the 1000 documents in success and failed category
for (i in 1:nrow(unsucess_tone)){
unsucess_tone$tone[i] <- sentiment(as.character(unsucess_tone$text[i]))
}
sucess_tone1 <- sucess_tone %>%
mutate(state = "successful")
unsucess_tone1 <- unsucess_tone%>%
mutate(state = "failed")
q3a <- rbind(unsucess_tone1,sucess_tone1)
plot1 <- ggplot(q3a)+geom_point(aes(x = tone , y = ach_ra, color = state)) + ylim(0,2e+06) + labs("The relation between achievement ratio and document tone")
plot1
From this graph, we can easily tell that for the projects which is successful, have larger document tones. However,for projects which is failed, though some have large document tone, the achievement ratio are all zero.
pos <- pos %>%
mutate(sentiment = "Positive")
neg <- neg %>%
mutate(sentiment = "Negative")
senti <- rbind(pos, neg)
colnames(senti) <- c("word","sentiment")
textdata1 <- tibble(text = as.character(q3a$text)) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words, by = "word") %>%
inner_join(senti, by = "word") %>%
dplyr::count(word, sentiment,sort = TRUE) %>%
ungroup()
library(reshape2)
textdata1 %>%
acast(word~sentiment, value.var= "n", fill = 0) %>%
comparison.cloud(colors = c("steelblue","darkgreen"),
max.words = 100)
sucess_tone2 <- tibble(text = as.character(sucess_tone1$text)) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words, by = "word") %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
dplyr::count(word, sentiment,sort = TRUE)
sucess_tone2 <- sucess_tone2 %>%
mutate(category = "Successful")
unsucess_tone2 <- tibble(text = as.character(unsucess_tone1$text)) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words, by = "word") %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
dplyr::count(word, sentiment,sort = TRUE)
unsucess_tone2 <- unsucess_tone2 %>%
mutate(category = "Failed")
all_emotion_tone <- rbind(sucess_tone2,unsucess_tone2)
all_emotion_tone
success_emotion <- all_emotion_tone %>%
filter(category == "Successful") %>%
head(50)
unsuccess_emotion <- all_emotion_tone %>%
filter(category == "Failed") %>%
head(50)
emo1 <- ggplot(success_emotion) + geom_col(mapping = aes(x = word, y = n, fill = sentiment))+ facet_wrap(~sentiment, scales = "free_y") + labs(y = "Contribution to sentiment", x = NULL, title = " For successful projects") + coord_flip()
emo1
emo2 <- ggplot(unsuccess_emotion) + geom_col(mapping = aes(x = word, y = n, fill = sentiment))+ facet_wrap(~sentiment, scales = "free_y") + labs(y = "Contribution to sentiment", x = NULL, title = " For failed projects") + coord_flip()
emo2
In these to graph, we can find out that the most frequent words that appeared in the successful projects is arts, which represents anticipation, joy, sadness and surprise. We can find that for most words in Failed projects and successful projects, the frequent words in each sentiment are the same. However, the frequency is different.